{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#本章需导入的模块\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "plt.rcParams['font.sans-serif']=['SimHei']  #解决中文显示乱码问题\n",
    "plt.rcParams['axes.unicode_minus']=False\n",
    "import warnings\n",
    "warnings.filterwarnings(action = 'ignore')\n",
    "from sklearn.metrics import confusion_matrix,f1_score,roc_curve, auc, precision_recall_curve,accuracy_score\n",
    "from sklearn.model_selection import train_test_split,KFold,LeaveOneOut,LeavePOut # 数据集划分方法\n",
    "from sklearn.model_selection import cross_val_score,cross_validate # 计算交叉验证下的测试误差\n",
    "from sklearn import preprocessing\n",
    "import sklearn.linear_model as LM\n",
    "from sklearn import neighbors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "旁置法的训练集:(140, 5) ；测试集：(60, 5)\n",
      "留一法训练集的样本量：199；测试集的样本量: 1\n",
      "留p法训练集的样本量：197；测试集的样本量: 3\n",
      "5折交叉验证法的训练集： [  0   1   2   3   5   6   7   8   9  10  11  12  13  14  15  16  17  18\n",
      "  21  22  23  24  25  27  28  29  30  32  33  34  35  36  38  39  40  41\n",
      "  42  43  44  45  46  47  48  49  51  54  55  56  57  58  59  60  61  62\n",
      "  63  64  65  66  67  68  69  70  71  73  74  75  76  77  78  80  81  83\n",
      "  84  86  87  89  90  91  92  94  96  97  98  99 100 101 102 103 105 106\n",
      " 107 109 110 111 112 113 114 115 116 117 118 120 122 123 124 125 126 129\n",
      " 130 131 132 134 135 136 137 138 141 142 143 145 146 147 148 150 151 152\n",
      " 153 154 155 156 157 159 160 161 163 164 165 167 168 169 171 173 174 175\n",
      " 176 177 181 186 187 188 190 191 192 193 194 195 196 197 198 199] \n",
      "测试集： [  4  19  20  26  31  37  50  52  53  72  79  82  85  88  93  95 104 108\n",
      " 119 121 127 128 133 139 140 144 149 158 162 166 170 172 178 179 180 182\n",
      " 183 184 185 189]\n"
     ]
    }
   ],
   "source": [
    "# Simulate N noisy observations of a non-linear signal, build a degree-5\n",
    "# polynomial design matrix, then demonstrate four data-splitting schemes.\n",
    "np.random.seed(123)  # seed the legacy global RNG so the noise is reproducible\n",
    "N = 200\n",
    "x = np.linspace(0.1, 10, num=N)  # starts at 0.1, so log(x) below is defined\n",
    "\n",
    "# Noise-free signal, computed with vectorized NumPy ufuncs. The np.math alias\n",
    "# of the stdlib math module was deprecated in NumPy 1.25 and removed in\n",
    "# NumPy 2.0, so np.math.sin/log/cos raise AttributeError on current NumPy.\n",
    "y = 10 * np.sin(4 * x) + 10 * x + 20 * np.log(x) + 30 * np.cos(x)\n",
    "z = y + np.random.normal(0, 3, size=N)  # one Gaussian noise draw (sd=3) per observation\n",
    "Y = np.array(z)\n",
    "\n",
    "# Design matrix whose columns are x**1 ... x**5.\n",
    "X = np.hstack([x.reshape(N, 1) ** degree for degree in range(1, 6)])\n",
    "\n",
    "# Hold-out split: 70% of the rows for training, the rest for testing.\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.70, random_state=123)\n",
    "print(\"旁置法的训练集:%s ；测试集：%s\" % (X_train.shape, X_test.shape))\n",
    "\n",
    "# Leave-one-out: break after the first split, since only one is shown.\n",
    "loo = LeaveOneOut()\n",
    "for train_index, test_index in loo.split(X):\n",
    "    print(\"留一法训练集的样本量：%s；测试集的样本量: %s\" % (len(train_index), len(test_index)))\n",
    "    break\n",
    "\n",
    "# Leave-p-out with p=3: again only the first split is shown.\n",
    "lpo = LeavePOut(p=3)\n",
    "for train_index, test_index in lpo.split(X):\n",
    "    print(\"留p法训练集的样本量：%s；测试集的样本量: %s\" % (len(train_index), len(test_index)))\n",
    "    break\n",
    "\n",
    "# Shuffled 5-fold cross-validation: print the row indices of the first fold.\n",
    "kf = KFold(n_splits=5, shuffle=True, random_state=123)\n",
    "for train_index, test_index in kf.split(X):\n",
    "    print(\"5折交叉验证法的训练集：\", train_index, \"\\n测试集：\", test_index)\n",
    "    break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "说明： 1、这里利用前述模拟数据，考察三种数据集划分及其测试误差的计算结果。需要引用sklearn.model_selection中的相关函数。 2、train_test_split(X,Y,train_size=0.70, random_state=123)实现数据集（X为输入变量矩阵，包含5个输入变量；输出变量为Y）划分的旁置法，这里指定训练集占原数据集（样本量为200）的70%，剩余30%为测试集。可指定random_state为任意整数以确保数据集的随机划分结果可以重现。函数依次返回训练集的输入变量、测试集的输入变量、训练集的输出变量和测试集的输出变量。可以看到划分结果为：训练集：(140, 5)；测试集：(60, 5)。 3、LeaveOneOut()实现留一法，可利用结果对象的split方法，浏览数据集的划分结果，即训练集和测试集的样本观测索引（编号）。因原数据集样本量为200，留一法将做200次训练集和测试集的轮换，可利用循环浏览每次的划分结果。这里利用break跳出循环，只看第1次的划分结果。LeavePOut(p=3)是对留一法的拓展，为留p法。例如这里测试集的样本量为3。 4、KFold(n_splits=5,shuffle=True,random_state=123)实现K折交叉验证法，这里为k=5。指定shuffle=True表示将数据顺序随机打乱后再做K折划分。这里仅显示了1次划分的结果（样本观测索引）。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k=10折交叉验证的MSE: 103.51193551109296\n",
      "k=10折交叉验证的MSE: 103.51193551109296\n",
      "LOO的MSE: 56.840587762694724\n"
     ]
    }
   ],
   "source": [
    "# Cross-validated test MSE of a linear regression fitted on the design matrix X.\n",
    "# The scorer returns the negated MSE, so every mean is sign-flipped before printing.\n",
    "# Other scorer names can be listed with sklearn.metrics.get_scorer_names().\n",
    "modelLR = LM.LinearRegression()\n",
    "k = 10\n",
    "mse_scoring = 'neg_mean_squared_error'\n",
    "\n",
    "# 10-fold CV through cross_val_score: one score per fold.\n",
    "CVscore = cross_val_score(modelLR, X, Y, scoring=mse_scoring, cv=k)\n",
    "print(\"k=10折交叉验证的MSE:\", -CVscore.mean())\n",
    "\n",
    "# The same 10-fold CV through cross_validate, which returns a dict of arrays\n",
    "# and, with return_train_score=True, also the training scores.\n",
    "scores = cross_validate(modelLR, X, Y, cv=k, scoring=mse_scoring, return_train_score=True)\n",
    "print(\"k=10折交叉验证的MSE:\", -scores['test_score'].mean())\n",
    "\n",
    "# N-fold CV (cv equal to the sample size N), i.e. leave-one-out.\n",
    "CVscore = cross_val_score(modelLR, X, Y, scoring=mse_scoring, cv=N)\n",
    "print(\"LOO的MSE:\", -CVscore.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "说明：cross_val_score()和cross_validate()都可自动给出模型在K折交叉验证法下的测试误差，cross_validate()还可给出训练误差。这里，模型为一般线性回归模型（五项式模型），计算10折交叉验证法下的测试误差。参数scoring为模型预测精度的度量，指定特定字符串，计算相应评价指标的结果。例如：'neg_mean_squared_error'表示计算负的MSE，因此需乘以-1还原为MSE。可通过sklearn.metrics.get_scorer_names()浏览其他的评价指标（旧版本中的sklearn.metrics.SCORERS.keys()自scikit-learn 1.3起已被移除）。若指定参数cv等于样本量，可得到N折交叉验证法（即留一法LOO）下的测试误差：cv为整数时，回归模型默认采用不打乱顺序的KFold划分，N折即每折仅含1个样本观测。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
